Abstract:

This is a technical blog post consisting of both an HTML file and a .qmd file, hosted on GitHub Pages.

0. Quarto Type-setting

1. Setup

Setup Code:

# `sh` is a short alias that silences package startup banners.
sh <- suppressPackageStartupMessages
sh({
  library(tidyverse)
  library(caret)
})
# Stream the gzip-compressed pinot.rds file from GitHub and deserialize it.
wine <- "https://github.com/cd-public/D505/raw/master/dat/pinot.rds" %>%
  url() %>%
  gzcon() %>%
  readRDS()

2. Conditional Probability

Calculate the probability that a Pinot comes from Burgundy given it has the word ‘fruit’ in the description.

\[ P({\rm Burgundy}~|~{\rm Fruit}) \]

## Creating Fruit Feature
# Flag every review whose description contains the substring "fruit".
wino <- mutate(wine, fruit = str_detect(description, "fruit"))

wino
## Conditional Probability
# P(Burgundy and Fruit): share of all reviews that are from Burgundy AND
# mention fruit. Rows where the condition is NA are not counted.
burgundy_and_fruit <- with(
  wino,
  sum(province == "Burgundy" & fruit, na.rm = TRUE)
) / nrow(wino)

# P(Fruit): share of all reviews that mention fruit. Inside `with()`, `fruit`
# is the logical column of `wino`; the result is stored as the scalar `fruit`.
fruit <- with(wino, sum(fruit, na.rm = TRUE)) / nrow(wino)

# P(Burgundy | Fruit) = P(Burgundy and Fruit) / P(Fruit)
burgundy_and_fruit / fruit
## [1] 0.2196038

3. Naive Bayes Algorithm

We train a Naive Bayes classifier to predict a wine’s province using: (1) an 80-20 train-test split, (2) three features engineered from the description, and (3) 5-fold cross-validation.

We report Kappa after using the model to predict provinces in the holdout sample.

## Searching for Common Words
library(tidytext)
# Tokenise every description into single words and rank them by frequency
# to pick candidate feature words.
wine_desc <- wine %>%
  unnest_tokens(word, description) %>%
  count(word) %>%
  arrange(desc(n))

wine_desc
## acidity, cherry, tart
## Creating 3 Features
# One logical indicator per chosen word: TRUE when the description contains it.
wino <- wine %>%
  mutate(
    cherry = str_detect(description, "cherry"),
    tart = str_detect(description, "tart"),
    acidity = str_detect(description, "acidity")
  )

wino
## 80-20 Split
set.seed(5)

# Partition on province so the 80/20 split is stratified by the outcome.
wine_index <- createDataPartition(wino$province, p = 0.8, list = FALSE)
train <- wino[wine_index, ]
test <- wino[-wine_index, ]
## 5-Fold Cross Validation
train_control <- trainControl(method = "cv", number = 5)

# Name the three engineered features explicitly. `province ~ .` would feed
# every other column of `wino` (including the raw description text) to the
# model instead of the three features this section is about.
fit <- train(province ~ cherry + tart + acidity,
             data = train,
             method = "naive_bayes",
             metric = "Kappa",
             trControl = train_control)

fit
## Kappa on the Holdout Sample
# Predict provinces for the 20% holdout and compare against the truth.
holdout_pred <- predict(fit, newdata = test)
# Align the reference factor's levels with the prediction's levels so that
# confusionMatrix() compares like with like.
holdout_cm <- confusionMatrix(
  data = holdout_pred,
  reference = factor(test$province, levels = levels(holdout_pred))
)
holdout_cm$overall["Kappa"]

I tried at least 20 combinations of three features, and none of them produced a good Kappa value.

4. Frequency Differences

We find the three words that most distinguish New York Pinots from all other Pinots.

## Filtering for Pinots
# Keep reviews whose description mentions "pinot" (case-insensitive).
# NOTE(review): the data file is named pinot.rds and the earlier sections treat
# every row as a Pinot; if that holds, this filter only shrinks the sample and
# `pinots <- wine` may be what is wanted. Confirm before relying on the result.
pinots <- wine %>%
  filter(str_detect(description, "(?i)pinot"))
pinots

# Tokenise the `description` column of `reviews` into words, drop words of
# three characters or fewer (a crude stop-word filter), and return a table of
# `word` and count `n`, most frequent first.
count_description_words <- function(reviews) {
  reviews %>%
    unnest_tokens(word, description) %>%
    filter(nchar(word) > 3) %>%
    count(word) %>%
    arrange(desc(n))
}

## Finding Popular Terms for All Other (non-New York) Pinots
# The baseline is drawn from the same `pinots` population as the New York
# subset and excludes New York itself, so the two groups are disjoint and
# comparable.
pinots_words <- pinots %>%
  filter(province != "New_York") %>%
  count_description_words()

pinots_words
# Relative frequency of each word among non-New York Pinot descriptions.
pinots_words_freq <- pinots_words %>%
  mutate(pinot_freq = n / sum(n))

pinots_words_freq
## Filtering for New York Pinots
ny_pinots <- pinots %>%
  filter(province == "New_York")
ny_pinot_words <- count_description_words(ny_pinots)

ny_pinot_words
# Relative frequency of each word among New York Pinot descriptions.
ny_pinot_words_freq <- ny_pinot_words %>%
  mutate(ny_freq = n / sum(n))

ny_pinot_words_freq
## Frequency Differences
# A word that appears in only one group comes out of the full join with NA on
# the other side; treat that as a count and frequency of zero so the word is
# still ranked instead of producing an NA difference.
word_freq_diff <- full_join(pinots_words_freq, ny_pinot_words_freq, by = "word") %>%
  replace_na(list(n.x = 0L, n.y = 0L, pinot_freq = 0, ny_freq = 0)) %>%
  mutate(freq_diff = pinot_freq - ny_freq) %>%
  # Rank by magnitude: a word distinguishes New York whether it is over- or
  # under-represented there.
  arrange(desc(abs(freq_diff)))

word_freq_diff
# The three most distinguishing words.
slice_head(word_freq_diff, n = 3)

Among descriptive words, cherry, tannins, and acidity show the largest differences in frequency between New York Pinots and all other Pinots.